# coding=UTF-8
'''
Created on 15 Mar 2012

@author: R
'''

import re
from sys import argv
from base import processpage


def get_total_page_number(url, proxy):
    """Return the total number of listing pages advertised on a listing page.

    The page text is obtained from ``processpage(url, proxy)`` and searched
    for the "last page" link; the number embedded in that link's file name
    is returned as the page count.

    Args:
        url: Address of a listing page, passed through to ``processpage``.
        proxy: Proxy setting, passed through to ``processpage`` unchanged.

    Returns:
        The page number captured from the last-page link, as an ``int``.

    Raises:
        ValueError: If no last-page link is found in the fetched text.
    """
    pagetext = processpage(url, proxy)
    # Raw string so that ``\d`` reaches the regex engine as written instead
    # of relying on Python passing an unknown string escape through.
    match = re.search(r'___________(\d+)_.htm">尾页', pagetext)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("last-page link not found in page: %s" % url)
    return int(match.group(1))


def get_list(url, proxy):
    """Return the estate links found on a listing page as "name,url" lines.

    The page text is obtained from ``processpage(url, proxy)``. Two link
    patterns are applied in turn: first site-root links ending in
    ``soufun.com/``, then links under ``soufun.com/house/`` ending in
    ``htm``. All matches of the first pattern come before all matches of
    the second in the result.

    Args:
        url: Address of the listing page, passed through to ``processpage``.
        proxy: Proxy setting, passed through to ``processpage`` unchanged.

    Returns:
        A string with one ``<link text>,<link url>`` line (terminated by a
        newline) per match, or an empty string when nothing matches.
    """
    pagetext = processpage(url, proxy)
    # Raw strings avoid the invalid ``\/`` string escape; the compiled
    # patterns match exactly what the previous escaped forms matched.
    # NOTE(review): the dots in "soufun.com" are unescaped and therefore
    # match any character; kept as-is to preserve existing results.
    patterns = (
        r'(http://.*?soufun.com/)">(.*?)</a',
        r'(http://.*?soufun.com/house/[^/]+?htm)">(.*?)</a',
    )
    lines = []
    for pattern in patterns:
        lines.extend(
            name + "," + link + "\n"
            for link, name in re.findall(pattern, pagetext)
        )
    # Join once rather than growing a string inside the loops.
    return "".join(lines)


def run():
    """Scrape a range of listing pages and append the results to a file.

    Reads the output file path from ``argv[1]`` and the first page number
    from ``argv[2]``. For each page number in ``range(startpage, endpage)``
    the page URL is built, ``get_list`` is called on it, and the returned
    text is appended to the output file.

    Raises:
        ValueError: If ``argv[2]`` cannot be converted to an ``int``.
        IndexError: If ``argv`` has fewer than three entries.
    """
    proxy = '0'
    urlhead = "http://newhouse.wuhan.soufun.com/house/"
    urlcity = "%CE%E4%BA%BA"  # percent-encoded city name (Wuhan)
    # Percent-encoded property type: residential. Alternatives noted by the
    # original author: %B1%F0%CA%FB (villa), %D0%B4%D7%D6%C2%A5 (office
    # building).
    urlestate = "______%D7%A1%D5%AC___________"
    startpage = int(argv[2])
    # URL of page 1; only needed when endpage is detected automatically.
    urle = urlhead + urlcity + urlestate + "1_.htm"
    # End page; change by hand, or set it to
    # get_total_page_number(urle, proxy) to detect it automatically.
    # NOTE(review): range() below excludes endpage itself, so the page
    # numbered `endpage` is never fetched - confirm this is intended.
    endpage = 12
    # The context manager guarantees the file is flushed and closed, even
    # if fetching or parsing a page raises.
    with open(argv[1], "a") as fh_putlist:
        for i in range(startpage, endpage):
            url = urlhead + urlcity + urlestate + str(i) + "_.htm"
            fh_putlist.write(get_list(url, proxy))

        
if __name__ == "__main__":
    # Supply defaults only for arguments missing from the command line.
    # Appending unconditionally would shift the default output path into
    # argv[2] whenever exactly one argument is given, breaking int(argv[2]).
    if len(argv) < 2:
        # Default output file name (in the parent of the working directory).
        argv.append("..\\lists.txt")
    if len(argv) < 3:
        # Default startpage; e.g. 1 means start from page 1.
        argv.append('11')
    run()

            